# Script written for Homework 2 of 19-704 due on March 31st, 2015
# The goal is to replicate and extend results from Bierens and Ginther (2001)

#Set working directory. You will need to change this for your computer.
#setwd("/Users/Alexandra/Documents/PhD CMU/Courses/S15 _ 19-704 - Data Analysis/Homeworks/HW2")

# First step: install required packages and retrieve the data
install.packages("AER")

install.packages("car", repos = "http://lib.stat.cmu.edu/R/CRAN/")
library(car)

install.packages("nloptr")
library(nloptr)

library(AER)
data(CPS1988)

install.packages("quantreg")
library(quantreg)

# ----------------------------------------------------------------------------------------------------
# Replication work - Tables 1.A. and 2.A. from Bierens and Ginther (2001)

# Create a copy of the data set with which to work
HW2 <- data.frame(CPS1988)

# Taking a look at the data
summary(HW2)
head(HW2)

# Re-creating Table 1.A.: LAD estimation results for the standard Mincer type model -------

#Create the variables of interest
rwage <- HW2$wage # Real weekly wages
lwage <- log(rwage)
ed <- HW2$education # Years of schooling (can take values from 0 to 18)
exper <- HW2$experience # Years of experience
expers <- exper*exper

# Create the race dummy variable
HW2.metric <- transform(HW2, ethnicity = as.numeric(ethnicity))
# Command above to change the 'cauc' and 'afam' values of ethnicity into '1' and '2'
HW2.metric$race <- HW2.metric$ethnicity - 1 # Create the race dummy (= 1 if black; 0 otherwise)

# Median regressions
qreg1.race <- rq(lwage ~ race, data = HW2.metric, tau = 0.5)
summary(qreg1.race)
qreg1.ed <- rq(lwage ~ ed, data = HW2.metric, tau = 0.5)
summary(qreg1.ed)
qreg1.exper <- rq(lwage ~ exper, data = HW2.metric, tau = 0.5)
summary(qreg1.exper)
qreg1.expers <- rq(lwage ~ expers, data = HW2.metric, tau = 0.5)
summary(qreg1.expers)
qreg1.1 <- rq(lwage ~ 1, data = HW2.metric, tau = 0.5)
summary(qreg1.1)

# Re-creating Table 2.A. LAD estimation results for the quartic model ---------------------

# Create variables of interest
exper3 <- exper*exper*exper
exper4 <- expers*expers

# Median regressions
qreg2.race <- rq(lwage ~ race, data = HW2.metric, tau = 0.5)
summary(qreg2.race)
qreg2.ed <- rq(lwage ~ ed, data = HW2.metric, tau = 0.5)
summary(qreg2.ed)
qreg2.exper <- rq(lwage ~ exper, data = HW2.metric, tau = 0.5)
summary(qreg2.exper)
qreg2.expers <- rq(lwage ~ expers, data = HW2.metric, tau = 0.5)
summary(qreg2.expers)
qreg2.exper3 <- rq(lwage ~ exper3, data = HW2.metric, tau = 0.5)
summary(qreg2.exper3)
qreg2.exper4 <- rq(lwage ~ exper4, data = HW2.metric, tau = 0.5)
summary(qreg2.exper4)
qreg2.1 <- rq(lwage ~ 1, data = HW2.metric, tau = 0.5)
summary(qreg2.1)

# -----------------------------------------------------------------------------------------------------
# Extension Work

# Question 1 - Histograms of wages, log wages, education, and experiences variables ------

#Create a .png file that will contain everything between png() and dev.off()
png(filename = "HW2hist.png",
    width = 4000, 
    height = 4000,
    res = 400)

# Parameters for plots
par(mfrow = c(2, 2), # mfrow sets the number of rows and columns
    cex = 1.3, # cex scales text and symbols
    mar = c(4, 3, 2, 1), # mar and oma set the inner and outer margins
    oma = c(0, 0, 0, 0))

# Plot a histogram
hist(rwage,
     xlim = c(0, 2500), # the x axis ranges from 50 to 1,000
     ylim = c(0, 5000), # the y axis ranges from 0 to 10,000
     main = "Weekly Wages", # title of the histogram
     breaks = seq(0, 20000, by = 125),
     xlab = "")

hist(lwage,
     xlim = c(0, 10),
     ylim = c(0, 10000),
     main = "Log Wages",
     breaks = seq(0, 10, by = 0.5),
     xlab = "")

hist(ed,
     xlim = c(0, 20),
     ylim = c(0,12000),
     main = "Education (years)",
     breaks = seq(0, 18, by = 1),
     xlab = "")

hist(exper,
     xlim = c(-10, 70),
     ylim = c(0, 5000),
     main = "Experience (years)",
     breaks = seq(-10, 70, by = 4),
     xlab = "")

dev.off()

# Summaries of the variables
summary(rwage)
summary(lwage)
summary(ed)
summary(exper)

# Question 2 - Scatterplots of wages and log wages against education and experience ------

# Create .png file with scatterplots for wages against education
png(filename = "HW2scatterplot.png",
    width = 3000,
    height = 3000,
    res = 300)

par(mfrow = c(2,2),
    cex =1.3)

plot(ed, rwage,
     xlab = "Education (years)",
     ylab = "Weakly Wages",
     ylim = c(0, 20000),
     xlim = c(0, 18),
     pch = 19,
     cex = 1, # set size for points in plot
     col = rgb(0, 0, 0, 0.7))
edwage.reg <- rq(rwage ~ ed, data = HW2.metric, tau = 0.5)
abline(edwage.reg, col = "red")

plot(exper, rwage,
     xlab = "Experience (years)",
     ylab = "Weakly Wages",
     ylim = c(0, 20000),
     xlim = c(-4, 63),
     pch = 19,
     cex = 1, # set size for points in plot
     col = rgb(0, 0, 0, 0.7))
expwage.reg <- rq(rwage ~ exper, data = HW2.metric, tau = 0.5)
abline(expwage.reg, col = "red")

plot(ed, lwage,
     xlab = "Education (years)",
     ylab = "Log Wages",
     ylim = c(2, 10),
     xlim = c(0, 18),
     pch = 19,
     cex = 1, # set size for points in plot
     col = rgb(0, 0, 0, 0.7))
edlwage.reg <- rq(lwage ~ ed, data = HW2.metric, tau = 0.5)
abline(edlwage.reg, col = "red")

plot(exper, lwage,
     xlab = "Experience (years)",
     ylab = "Log Wages",
     ylim = c(2, 10),
     xlim = c(-4, 63),
     pch = 19,
     cex = 1, # set size for points in plot
     col = rgb(0, 0, 0, 0.7))
explwage.reg <- rq(lwage ~ exper, data = HW2.metric, tau = 0.5)
abline(explwage.reg, col = "red")

dev.off()

# Question 3 - Comparison to the normal distribution -------------------------------------

# Simulate a normal random variable (RV) using rnorm(x, y, z)
# The number of observations to draw is x
# The mean of the normal random variable is y
# The standard deviation of the normal random variable is z

# Simulation for wages
x.wage <- length(rwage)
y.wage <- mean(rwage)
z.wage <- sd(rwage)
set.seed(4)
norm.wage <- rnorm(x.wage, y.wage, z.wage)

# Simulation for log wages
x.lwage <- length(lwage)
y.lwage <- mean(lwage)
z.lwage <- sd(lwage)
set.seed(4)
norm.lwage <- rnorm(x.lwage, y.lwage, z.lwage)

# Plot the normal data against wages and log wages
png(filename = "compNormal.png",
    width = 3000,
    height = 3000,
    res = 300)

par(mfrow = c(2,1),
    cex = 1.3)

# The floor and ceiling functions find the integer that is not greater
# (or less than) any element of a vector
min.dw <- floor(min(rwage, norm.wage)) # Find the smallest value
min.dlw <- floor(min(lwage, norm.lwage))
max.dw <- ceiling(max(rwage, norm.wage)) # Find the largest value
max.dlw <- ceiling(max(lwage, norm.lwage))

plot(sort(norm.wage),
     sort(rwage),
     xlim = c(min.dw, max.dw),
     ylim = c(min.dw, max.dw),
     ylab = "Sorted Weekly Wages",
     xlab = "Sorted Draws from a Normal Distribution",
     pch = 19,
     col = rgb(0, 0, 0, .7))
abline(0, 1) #Draw a line on the plot with intercept 0 and slope 1

plot(sort(norm.lwage),
     sort(lwage),
     xlim = c(min.dlw, max.dlw),
     ylim = c(min.dlw, max.dlw),
     ylab = "Sorted Log Wages",
     xlab = "Sorted Draws from a Normal Distribution",
     pch = 19,
     col = rgb(0, 0, 0, 0.7))
abline(0, 1)

dev.off()

# Question 4 - Regression Residuals ------------------------------------------------------

# Calculate OLS regression for Tables 1.A. and 2.A.
race.reg <- lm(lwage ~ race, data = HW2.metric)
ed.reg <- lm(lwage ~ ed, data = HW2.metric)
exper.reg <- lm(lwage ~ exper, data = HW2.metric)
expers.reg <- lm(lwage ~ expers, data = HW2.metric)
exper3.reg <- lm(lwage ~ exper3, data = HW2.metric)
exper4.reg <- lm(lwage ~ exper4, data = HW2.metric)
one.reg <- lm(lwage ~ 1, data = HW2.metric)

# First plot the OLS regression residuals for Tables 1.A. and 2.A. 
png(filename = "HW2_reg_residuals1.png",
    width = 3000,
    height = 3000,
    res = 300)

par(mfrow = c(1, 2),
    cex = 1.3)

# qqplot of the regression residuals from the OLS regression
# id.n = 3 identifies the three largest residuals
qqPlot(race.reg,
       main = "Race",
       id.n = 3,
       pch = 19,
       col = rgb(0, 0, 0, .5))
abline(0, 1)

qqPlot(ed.reg,
       main = "Education",
       id.n = 3,
       pch = 19,
       col = rgb(0, 0, 0, .5))
abline(0, 1)

dev.off()

# Second plot the OLS regression residuals for Tables 1.A. and 2.A. 
png(filename = "HW2_reg_residuals2.png",
    width = 3000,
    height = 3000,
    res = 300)

par(mfrow = c(1, 2),
    cex = 1.3)

# qqplot of the regression residuals from the OLS regression
qqPlot(exper.reg,
       main = "Experience",
       id.n = 3,
       pch = 19,
       col = rgb(0, 0, 0, .5))
abline(0, 1)

qqPlot(expers.reg,
       main = "Squared Experience",
       id.n = 3,
       pch = 19,
       col = rgb(0, 0, 0, .5))
abline(0, 1)

dev.off()

# Third plot the OLS regression residuals for Tables 1.A. and 2.A. 
png(filename = "HW2_reg_residuals3.png",
    width = 3000,
    height = 3000,
    res = 300)

par(mfrow = c(1, 2),
    cex = 1.3)

# qqplot of the regression residuals from the OLS regression
qqPlot(exper3.reg,
       main = "Cubic Experience",
       id.n = 3,
       pch = 19,
       col = rgb(0, 0, 0, .5))
abline(0, 1)

qqPlot(exper4.reg,
       main = "Quartic Experience",
       id.n = 3,
       pch = 19,
       col = rgb(0, 0, 0, .5))
abline(0, 1)

dev.off()

# Fourth plot the OLS regression residuals for Tables 1.A. and 2.A. 
png(filename = "HW2_reg_residuals4.png",
    width = 3000,
    height = 3000,
    res = 300)

par(mfrow = c(1, 2),
    cex = 1.3)

# qqplot of the regression residuals from the OLS regression
qqPlot(one.reg,
       main = "One",
       id.n = 3,
       pch = 19,
       col = rgb(0, 0, 0, .5))
abline(0, 1)

dev.off()

# Question 5 - Backcasting and cross-validation ------------------------------------------

png(filename = "backcast1.png",
    width = 4000,
    height = 4000,
    res = 400)

par(mfrow = c(3, 2),
  cex = 1.3)

# Backcasting : Plot the predicted log wages versus observed log wages
plot(predict(qreg1.race, HW2.metric), lwage,
     xlim = c(0, 10),
     ylim = c(0, 10),
     ylab = "Actual Log Wages",
     xlab = "Predicted Log wages",
     main = "Race",
     pch = 19,
     col = rgb(0, 0, 0, .3))
abline(0, 1)

plot(predict(qreg1.ed, HW2.metric), lwage,
     xlim = c(0, 10),
     ylim = c(0, 10),
     ylab = "Actual Log Wages",
     xlab = "Predicted Log wages",
     main = "Education",
     pch = 19,
     col = rgb(0, 0, 0, .3))
abline(0, 1)

plot(predict(qreg1.exper, HW2.metric), lwage,
     xlim = c(0, 10),
     ylim = c(0, 10),
     ylab = "Actual Log Wages",
     xlab = "Predicted Log wages",
     main = "Experience",
     pch = 19,
     col = rgb(0, 0, 0, .3))
abline(0, 1)

plot(predict(qreg1.expers, HW2.metric), lwage,
     xlim = c(0, 10),
     ylim = c(0, 10),
     ylab = "Actual Log Wages",
     xlab = "Predicted Log wages",
     main = "Squared Experience",
     pch = 19,
     col = rgb(0, 0, 0, .3))
abline(0, 1)

plot(predict(qreg1.1, HW2.metric), lwage,
     xlim = c(0, 10),
     ylim = c(0, 10),
     ylab = "Actual Log Wages",
     xlab = "Predicted Log wages",
     main = "One",
     pch = 19,
     col = rgb(0, 0, 0, .3))
abline(0, 1)

dev.off()

# Cross-validation (5-fold) to test the complexity of the model
nfolds <- 5
#divide the cases as evenly as possible
case.folds <- rep(1:nfolds, length.out = nrow(HW2.metric))
#randomly permute the order
set.seed(4)
case.folds <- sample(case.folds)

complex.race <- c() # Create empty vectors
complex.ed <- c()
complex.exper <- c()
complex.expers <- c()
simple <- c()
for (fold in 1:nfolds) {
  # Create training and test cases
  train <- HW2.metric[case.folds != fold, ]
  test <- HW2.metric[case.folds == fold, ]
  # Run the simple and complex models
  simple.train <- rq(log(wage) ~ 1, data = train, tau = 0.5)
  complex.race.train <- rq(log(wage) ~ race, data = train, tau = 0.5)
  complex.ed.train <- rq(log(wage) ~ education, data = train, tau = 0.5)
  complex.exper.train <- rq(log(wage) ~ experience, data = train, tau = 0.5)
  complex.expers.train <- rq(log(wage) ~ experience*experience, data = train, tau = 0.5)
  #Generate test MSEs
  simple.test <- (log(test$wage) - predict(simple.train, test))^2
  complex.race.test <- (log(test$wage) - predict(complex.race.train, test))^2
  complex.ed.test <- (log(test$wage) - predict(complex.ed.train, test))^2
  complex.exper.test <- (log(test$wage) - predict(complex.exper.train, test))^2
  complex.expers.test <- (log(test$wage) - predict(complex.expers.train, test))^2
  # Calculate rMSEs
  rMSEtest1 <- sqrt(sum(simple.test)/length(simple.test))
  rMSEtest.race <- sqrt(sum(complex.race.test)/length(complex.race.test))
  rMSEtest.ed <- sqrt(sum(complex.ed.test)/length(complex.ed.test))
  rMSEtest.exper <- sqrt(sum(complex.exper.test)/length(complex.exper.test))
  rMSEtest.expers <- sqrt(sum(complex.expers.test)/length(complex.expers.test))
  # Append the rMSE from this iteration to vectors
  simple <- append(simple, rMSEtest1)
  complex.race <- append(complex.race, rMSEtest.race)
  complex.ed <- append(complex.ed, rMSEtest.ed)
  complex.exper <- append(complex.exper, rMSEtest.exper)
  complex.expers <- append(complex.expers, rMSEtest.expers)
}
#Average the MSEs
simple.avg <- mean(simple)
complex.race.avg <- mean(complex.race)
complex.ed.avg <- mean(complex.ed)
complex.exper.avg <- mean(complex.exper)
complex.expers.avg <- mean(complex.expers)
